{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set up environment variables and load necessary libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PROJECT = \"qwiklabs-gcp-da02053fb2a13c97\"  # Replace with your PROJECT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ[\"PROJECT\"] = PROJECT\n",
    "#os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = 'service_account_key.json' # for local ONLY"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_query(phase, sample_size):\n",
    "    basequery = \"\"\"\n",
    "    SELECT\n",
    "        fare_amount,\n",
    "        EXTRACT(DAYOFWEEK from pickup_datetime) AS dayofweek,\n",
    "        EXTRACT(HOUR from pickup_datetime) AS hourofday,\n",
    "        pickuplon,\n",
    "        pickuplat,\n",
    "        dropofflon,\n",
    "        dropofflat,\n",
    "        trips_last_5min\n",
    "    FROM\n",
    "        `taxifare.traffic`\n",
    "    WHERE\n",
    "        trip_distance > 0\n",
    "        AND fare_amount >= 2.5\n",
    "        AND pickuplon > -78\n",
    "        AND pickuplon < -70\n",
    "        AND dropofflon > -78\n",
    "        AND dropofflon < -70\n",
    "        AND pickuplat > 37\n",
    "        AND pickuplat < 45\n",
    "        AND dropofflat > 37\n",
    "        AND dropofflat < 45\n",
    "        AND passenger_count > 0\n",
    "        AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), EVERY_N)) = 1\n",
    "    \"\"\"\n",
    "\n",
    "    if phase == \"TRAIN\":\n",
    "        subsample = \"\"\"\n",
    "        AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), EVERY_N * 100)) >= (EVERY_N * 0)\n",
    "        AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), EVERY_N * 100)) <  (EVERY_N * 70)\n",
    "        \"\"\"\n",
    "    elif phase == \"VALID\":\n",
    "        subsample = \"\"\"\n",
    "        AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), EVERY_N * 100)) >= (EVERY_N * 70)\n",
    "        AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), EVERY_N * 100)) <  (EVERY_N * 85)\n",
    "        \"\"\"\n",
    "    elif phase == \"TEST\":\n",
    "        subsample = \"\"\"\n",
    "        AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), EVERY_N * 100)) >= (EVERY_N * 85)\n",
    "        AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), EVERY_N * 100)) <  (EVERY_N * 100)\n",
    "        \"\"\"\n",
    "\n",
    "    query = basequery + subsample\n",
    "    return query.replace(\"EVERY_N\", sample_size)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write to CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from google.cloud import bigquery\n",
    "bq = bigquery.Client(project=PROJECT)\n",
    "\n",
    "for phase in [\"TRAIN\", \"VALID\", \"TEST\"]:\n",
    "    # 1. Create query string\n",
    "    query_string = create_query(phase, \"5000\")\n",
    "    # 2. Load results into DataFrame\n",
    "    df = bq.query(query_string).to_dataframe()\n",
    "\n",
    "    # 3. Write DataFrame to CSV\n",
    "    df.to_csv(\"taxi-{}.csv\".format(phase.lower()), index_label = False, index = False)\n",
    "    print(\"Wrote {} lines to {}\".format(len(df), \"taxi-{}.csv\".format(phase.lower())))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that even with a 1/5000th sample we have a good amount of data for ML. 150K training examples and 30K validation."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Verify that datasets exist "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -l *.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preview one of the files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head taxi-train.csv"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
